/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.net; import java.io.Reader; import java.io.FileReader; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.util.logging.Logger; import org.apache.oro.text.regex.Perl5Compiler; import org.apache.oro.text.regex.Perl5Matcher; import org.apache.oro.text.regex.Perl5Pattern; import org.apache.oro.text.regex.Pattern; import org.apache.oro.text.regex.PatternMatcher; import org.apache.oro.text.regex.MalformedPatternException; import net.nutch.util.*; /** Filters URLs based on a file of regular expressions. The config file is * named by the Nutch configuration property "urlfilter.regex.file". * * <p>The format of this file is: * <pre> * [+-]<regex> * </pre> * where plus means go ahead and index it and minus means no. */ public class RegexURLFilter implements URLFilter { private static final Logger LOG = LogFormatter.getLogger("net.nutch.net.RegexURLFilter"); private static class Rule { public Perl5Pattern pattern; public boolean sign; public String regex; } private List rules; private PatternMatcher matcher = new Perl5Matcher(); public RegexURLFilter() throws IOException, MalformedPatternException { String file = NutchConf.get("urlfilter.regex.file"); Reader reader = NutchConf.getConfResourceAsReader(file); if (reader == null) { LOG.severe("Can't find resource: " + file); } else { rules=readConfigurationFile(reader); } } public RegexURLFilter(String filename) throws IOException, MalformedPatternException { rules = readConfigurationFile(new FileReader(filename)); } public synchronized String filter(String url) { Iterator i=rules.iterator(); while(i.hasNext()) { Rule r=(Rule) i.next(); if (matcher.contains(url,r.pattern)) { //System.out.println("Matched " + r.regex); return r.sign ? url : null; } }; return null; // assume no go } // // Format of configuration file is // // [+-]<regex> // // where plus means go ahead and index it and minus means no. // private static List readConfigurationFile(Reader reader) throws IOException, MalformedPatternException { BufferedReader in=new BufferedReader(reader); Perl5Compiler compiler=new Perl5Compiler(); List rules=new ArrayList(); String line; while((line=in.readLine())!=null) { if (line.length() == 0) continue; char first=line.charAt(0); boolean sign=false; switch (first) { case '+' : sign=true; break; case '-' : sign=false; break; case ' ' : case '\n' : case '#' : // skip blank & comment lines continue; default : throw new IOException("Invalid first character: "+line); } String regex=line.substring(1); Rule rule=new Rule(); rule.pattern=(Perl5Pattern) compiler.compile(regex); rule.sign=sign; rule.regex=regex; rules.add(rule); } return rules; } public static void main(String args[]) throws IOException, MalformedPatternException { RegexURLFilter filter=new RegexURLFilter(); BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); String line; while((line=in.readLine())!=null) { String out=filter.filter(line); if(out!=null) { System.out.print("+"); System.out.println(out); } else { System.out.print("-"); System.out.println(line); } } } }